library(tidyverse); library(cowplot)
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.2
library(phyloseq); library(decontam)
library(plotly); library(lubridate)
## Warning: package 'lubridate' was built under R version 4.1.1

1 Merged data from QIIME2 analysis

Run below to repeat ASV compilation from raw QIIME2 output.

# merged_tax <- read_delim("/Users/sarahhu/Desktop/Projects/microeuks_deepbiosphere_datamine/microeuk-amplicon-survey/data-input/taxonomy.tsv", delim = "\t")
# merged_asv <- read_delim("/Users/sarahhu/Desktop/Projects/microeuks_deepbiosphere_datamine/microeuk-amplicon-survey/data-input/microeuk-merged-asv-table.tsv", delim = "\t", skip = 1)
# metadata <- read.delim("/Users/sarahhu/Desktop/Projects/microeuks_deepbiosphere_datamine/microeuk-amplicon-survey/data-input/samplelist-metadata.txt")
# 
# asv_wtax <- merged_asv %>%
#   select(FeatureID = '#OTU ID', everything()) %>%
#   pivot_longer(cols = !FeatureID,
#                names_to = "SAMPLE", values_to = "value") %>%
#   left_join(merged_tax, by = c("FeatureID" = "Feature ID")) %>%
#   left_join(metadata) %>%
#   filter(SITE == "GordaRidge" | SITE == "substrate" | SITE == "Laboratory") %>%
#   filter(!grepl("Siders_", SAMPLE)) %>%
#   filter(!(grepl("T0", SAMPLE))) %>%
#   filter(!(grepl("T24", SAMPLE))) %>%
#   filter(!(grepl("T36", SAMPLE))) %>%
#   mutate(DATASET = case_when(
#     grepl("_GR_", SAMPLE) ~ "GR",
#     grepl("Gorda", SAMPLE) ~ "GR",
#     grepl("_MCR_", SAMPLE) ~ "MCR",
#     grepl("Axial", SAMPLE) ~ "Axial",
#   TRUE ~ "Control or blank")) %>%
#     separate(Taxon, c("Domain", "Supergroup",
#                   "Phylum", "Class", "Order",
#                   "Family", "Genus", "Species"), sep = ";", remove = FALSE)
# 
# # fix naming, some controls sequenced separately.
# gr_substrate_fluid_asvs <- asv_wtax %>%
#   mutate(SAMPLE_tmp = case_when(
#     Sample_actual == "" ~ SAMPLE,
#     TRUE ~ Sample_actual
#   )) %>%
#   select(-SAMPLE) %>%
#   select(SAMPLE = SAMPLE_tmp, everything()) %>%
#   filter(value > 0)

# View(gr_substrate_fluid_asvs)
# View(gr_substrate_fluid_asvs %>% filter(Sample_or_Control == "Control") %>% select(SAMPLE) %>% distinct())
# save(gr_substrate_fluid_asvs, file = "/Users/sarahhu/Desktop/Projects/GordaRidge-microcolonizers/microcolonizers-GordaRidge/input-data/asv-table.RData")

2 Background information

Microcolonizers (or ‘cones’) were deployed at the Gorda Ridge hydrothermal vent field. Each microcolonizer was placed over a region of visible diffuse fluid flow. A total of 6 microcolonizers were depeloyed at one time, pairs of experiments were picked up after 6, 7, and 8 days.

Each microcolonizer chamber had 6 different substrates, so that diffuse fluid could reach each substrate. Temperature loggers also recorded temperature for the duration of the deployments. Substrates included: shell, riftia shell, quartz, pyrite, basalt, and olivine.

Microcolonizers at Mt. Edwards vent site. Credit: Ocean Exploration Trust

Microcolonizers at Mt. Edwards vent site. Credit: Ocean Exploration Trust

Recovering microcolonizers with ROV Hercules. Credit: Ocean Exploration Trust

Recovering microcolonizers with ROV Hercules. Credit: Ocean Exploration Trust

Upon recovery of each experiment, substrates were saved for microscopy and molecular analysis. For the sequence data below (shell, quartz, and riftia), RNA was extracted, cDNA was created and the V4 18S rRNA hypervariable region was amplified and sequenced. Blank substrates (which sat with milliQ during the shipboard processing) were also sequenced alongside the experimental treatments.

Opening up microcolonizers after recovery. Credit: Mirmalek

Opening up microcolonizers after recovery. Credit: Mirmalek

3 Import microcolonizer data

load("input-data/asv-table.RData", verbose = TRUE)
## Loading objects:
##   gr_substrate_fluid_asvs

3.0.1 Quick look - unprocessed data

Barplots to show total number of sequences and total number of ASVs

plot_grid(gr_substrate_fluid_asvs %>% 
  filter(value > 0) %>% 
  ggplot(aes(x = SAMPLE)) +
  geom_bar(stat = "count", width = 0.9) +
  labs(y = "Total ASVs") +
  coord_flip() +
  theme_linedraw() +
  facet_grid(DATASET ~ ., scale = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black")),
  gr_substrate_fluid_asvs %>% 
  group_by(SAMPLE, SITE, Domain, DATASET) %>% 
  summarise(SUM_SEQ_DOMAIN = sum(value)) %>% 
  ggplot(aes(x = SAMPLE, y = SUM_SEQ_DOMAIN, fill = Domain)) +
  geom_bar(stat = "identity", color = "black", width = 0.9) +
  labs(y = "Total Sequences") +
  coord_flip() +
  theme_linedraw() +
  facet_grid(DATASET ~ ., scale = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black"),
        legend.position = "bottom"),
  ncol = 2)
## `summarise()` has grouped output by 'SAMPLE', 'SITE', 'Domain'. You can
## override using the `.groups` argument.

3.0.2 Table reporting total ASVs and sequences

# View(gr_substrate_fluid_asvs %>% filter(value > 0) %>% 
#        group_by(SAMPLE, DATASET, SITE) %>% 
#        summarise(SEQ_SUM = sum(value),
#                  ASV_COUNT = n()))

3.1 Decontaminate sequence library

Import sample description text file, import as phyloseq library, and remove potential contaminate ASVs and sequences. Catalog total number of ASVs and sequences removed from analysis.

3.1.1 Import as phyloseq objects

# head(gr_substrate_fluid_asvs)
tax_matrix <- gr_substrate_fluid_asvs %>% 
  select(FeatureID, Taxon) %>% 
  distinct() %>% 
  separate(Taxon, c("Domain", "Supergroup", 
                  "Phylum", "Class", "Order",
                  "Family", "Genus", "Species"), sep = ";", remove = FALSE) %>% 
  column_to_rownames(var = "FeatureID") %>% 
  as.matrix
## Warning: Expected 8 pieces. Additional pieces discarded in 3115 rows [3, 4, 6,
## 9, 11, 12, 13, 16, 21, 25, 28, 30, 33, 38, 42, 44, 47, 49, 50, 52, ...].
## Warning: Expected 8 pieces. Missing pieces filled with `NA` in 2988 rows [1, 2,
## 5, 7, 8, 10, 14, 15, 17, 18, 19, 20, 22, 23, 24, 26, 27, 29, 31, 32, ...].
asv_matrix <- gr_substrate_fluid_asvs %>% 
  select(FeatureID, SAMPLE, value) %>% 
  pivot_wider(names_from = "SAMPLE", values_fill = 0, values_from = value) %>% 
  column_to_rownames(var = "FeatureID") %>% 
  as.matrix

# Align row names for each matrix
rownames(tax_matrix) <- row.names(asv_matrix)

metadata_cones <- gr_substrate_fluid_asvs %>% 
  select(SAMPLE, Type, VENT, SITE, SAMPLETYPE, Sample_or_Control) %>% 
  distinct() %>% 
  column_to_rownames(var = "SAMPLE")
# Import asv and tax matrices
ASV = otu_table(asv_matrix, taxa_are_rows = TRUE)
TAX = tax_table(tax_matrix)
phylo_obj <- phyloseq(ASV, TAX)

# Import metadata as sample data in phyloseq
samplenames <- sample_data(metadata_cones)

# join as phyloseq object
physeq_wnames = merge_phyloseq(phylo_obj, samplenames)
# colnames(ASV)

## Check
# physeq_wnames
# View(gr_substrate_fluid_asvs %>% filter(value > 0))

3.1.2 Identify contaminant ASVs

In addition to shipboard milliQ blank samples, each substrate type had a ‘blank’ control, which was sampled at the same time, but never deployed in the microcolonizers (only processed at the same time in the lab).

# When "Control" appears in "Sample_or_Control column, this is a negative control"
sample_data(physeq_wnames)$is.neg <- sample_data(physeq_wnames)$Sample_or_Control == "Control"
# ID contaminants using Prevalence information
contam_prev <- isContaminant(physeq_wnames, 
                               method="prevalence", 
                               neg="is.neg", 
                               threshold = 0.5, normalize = TRUE) 

# Report number of ASVs IDed as contamintants
table(contam_prev$contaminant)
## 
## FALSE  TRUE 
##  6060    43

0.5 - this threshold will ID contaminants in all samples that are more prevalent in negative controls than in positive samples.

3.1.3 Remove problematic ASVs

# Subset contaminant ASVs
contams <- filter(contam_prev, contaminant == "TRUE")
list_of_contam_asvs <- as.character(row.names(contams))
# length(list_of_contam_asvs)

taxa_contam <- as.data.frame(tax_matrix) %>% 
  rownames_to_column(var = "FeatureID") %>% 
  filter(FeatureID %in% list_of_contam_asvs)
# head(taxa_contam)
# View(asv_wtax)
asv_wtax_decon <- gr_substrate_fluid_asvs %>% 
  filter(!(FeatureID %in% list_of_contam_asvs)) %>% 
  filter(!(Sample_or_Control == "Control"))

tmp_orig <- (gr_substrate_fluid_asvs %>% filter(!(Sample_or_Control == "Control")))

# Stats on lost
x <- length(unique(tmp_orig$FeatureID)); x
## [1] 5912
y <- length(unique(asv_wtax_decon$FeatureID)); y
## [1] 5886
100*((y-x)/x) #0.43% of ASVs lost
## [1] -0.4397835
a <- sum(tmp_orig$value);a #3.1 million
## [1] 3155152
b <- sum(asv_wtax_decon$value);b #2.89 million 
## [1] 2981359
100*((b-a)/a)
## [1] -5.508229
# Lost 5.5% of sequences from whole dataset.

## Subsample to clean ASVs
asv_wtax_wstats <- gr_substrate_fluid_asvs %>% 
  mutate(DECONTAM = case_when(
    FeatureID %in% list_of_contam_asvs ~ "FAIL",
    TRUE ~ "PASS"
  ))
plot_grid(asv_wtax_wstats %>% 
  filter(value > 0) %>% 
  ggplot(aes(x = SAMPLE, fill = DECONTAM)) +
  geom_bar(stat = "count", width = 0.9, color = "black") +
  labs(y = "Total ASVs") +
  coord_flip() +
  theme_linedraw() +
  facet_grid(DATASET ~ ., scale = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black"),
        legend.position = "bottom"),
  asv_wtax_wstats %>% 
  group_by(SAMPLE, SITE, DECONTAM, DATASET) %>% 
  summarise(SUM_SEQ_DOMAIN = sum(value)) %>% 
  ggplot(aes(x = SAMPLE, y = SUM_SEQ_DOMAIN, fill = DECONTAM)) +
  geom_bar(stat = "identity", color = "black", width = 0.9) +
  labs(y = "Total Sequences") +
  coord_flip() +
  theme_linedraw() +
  facet_grid(DATASET ~ ., scale = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black"),
        legend.position = "bottom"),
  ncol = 2)
## `summarise()` has grouped output by 'SAMPLE', 'SITE', 'DECONTAM'. You can
## override using the `.groups` argument.

4 Subset Gorda Ridge Cones and in situ

Isolate Gorda Ridge substrate samples and the in situ samples from Mt Edwards, plume at Mt Edwards and the background seawater. Remove in situ

# colnames(asv_wtax_wstats)
# unique(asv_wtax_wstats$Sample_actual)
# Isolate blank substrate for GR cones

asv_GR <- asv_wtax_wstats %>% 
      filter(Sample_or_Control != "Control") %>% # rm controls
      filter(SITE == "substrate" | VENT == "Mt Edwards" | 
                VENT == "Mt Edwards Plume" | VENT == "Deep seawater" | VENT == "Shallow seawater") %>% 
      filter(DATASET != "Axial", DATASET != "MCR") %>% 
      filter(!grepl("_expTf_", SAMPLE)) %>% 
      filter(value > 0) %>% 
      filter(DECONTAM == "PASS") #%>% 
      # bind_rows(asv_substrate_blanks)

# Get quick stats on totals
sum(asv_GR$value) # 2.3 million sequences
## [1] 2366302
length(unique(asv_GR$FeatureID)) #3370 ASVs
## [1] 3370
# View(asv_GR)
# unique(asv_GR$VENT)

4.1 Start here for analyses.

# save(asv_GR, file = "input-data/asv-table-processed.RData")
# load("input-data/asv-table-processed.RData", verbose = TRUE)

Additional sample QC, check replicates, and determine if replicates should be averaged.

plot_grid(asv_GR %>% 
  group_by(SAMPLE, VENT, DATASET, SAMPLETYPE, Domain) %>% 
  summarise(seqsum_var = sum(value),
            asvcount_var = n()) %>% 
  pivot_longer(ends_with("_var"), names_to = "VARIABLE") %>% 
  ggplot(aes(x = SAMPLE, y = value, fill = Domain)) +
    geom_bar(color = "black", stat = "identity", position = "fill") +
    facet_grid(VARIABLE ~ SAMPLETYPE, space = "free", scales = "free") +
  scale_y_continuous(expand = c(0,0)) +
  theme_linedraw() +
  scale_fill_brewer(palette = "Paired") +
  theme(strip.background = element_blank(), strip.text = element_text(color = "black"),
        axis.text.x = element_text(color = "black", angle = 90, hjust = 1, vjust = 0.5),
        legend.position = "bottom"),
  asv_GR %>% 
  group_by(SAMPLE, VENT, DATASET, SAMPLETYPE, Domain) %>% 
  summarise(seqsum_var = sum(value),
            asvcount_var = n()) %>% 
  pivot_longer(ends_with("_var"), names_to = "VARIABLE") %>% 
  ggplot(aes(x = SAMPLE, y = value, fill = Domain)) +
    geom_bar(color = "black", stat = "identity", position = "stack") +
    facet_grid(VARIABLE ~ SAMPLETYPE, space = "free_x", scales = "free") +
  scale_y_continuous(expand = c(0,0)) +
  theme_linedraw() +
  scale_fill_brewer(palette = "Paired") +
  theme(strip.background = element_blank(), strip.text = element_text(color = "black"),
        axis.text.x = element_text(color = "black", angle = 90, hjust = 1, vjust = 0.5),
        legend.position = "bottom"),
ncol = 2)
## `summarise()` has grouped output by 'SAMPLE', 'VENT', 'DATASET', 'SAMPLETYPE'.
## You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SAMPLE', 'VENT', 'DATASET', 'SAMPLETYPE'.
## You can override using the `.groups` argument.

Based on above plot, remove “BSW020” sterivex filter.

4.2 Taxonomy bar plots

Supergroup distribution by background fluid vs. what was isolated from substrates.

Factor supergroup names

tmp <- asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(Sample_or_Control != "Control") %>% 
  # unite(SampleIdentifier, VENT, SAMPLETYPE, sep = " ", remove = FALSE) %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup))
colors_tax <- c("#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c", 
    "#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45", 
    "#00441b", "#7fcdbb", "#084081", "#c6dbef", "#2b8cbe", "#016c59", "#bcbddc", 
    "#807dba", "#54278f", "#bdbdbd", "black")

tax_names <- as.character(unique(tmp$Supergroup))
names(colors_tax) <- tax_names
# View(asv_GR)
by_supergroup_back <- asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(Sample_or_Control != "Control") %>% 
  # unite(SampleIdentifier, VENT, SAMPLETYPE, sep = " ", remove = FALSE) %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
           VENT, SITE, SAMPLETYPE, YEAR, DATASET, Type) %>% 
    summarise(SEQ_AVG_REP = mean(value)) %>% 
  ungroup() %>% 
  group_by(SITE, SAMPLETYPE, VENT, Supergroup, Type) %>% 
    summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>% 
  unite(Sample_substrate, VENT, Type, sep = " ") %>% 
  filter(SITE == "GordaRidge") %>% 
  ggplot(aes(x = Sample_substrate, y = SEQ_SUM, fill = Supergroup)) +
    geom_bar(stat = "identity", position = "fill", color = "black", width = 0.6) +
    facet_grid(. ~ SITE + SAMPLETYPE, space = "free_x", scales = "free") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black")) +
  scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values = colors_tax) +
  labs(title = "Microcolonizers and background", x = "", y = "Relative sequence abundance")
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain',
## 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT',
## 'SITE', 'SAMPLETYPE', 'YEAR', 'DATASET'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'Supergroup'.
## You can override using the `.groups` argument.
by_supergroup_substrate <- asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(Sample_or_Control != "Control") %>% 
  # unite(SampleIdentifier, VENT, SAMPLETYPE, sep = " ", remove = FALSE) %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
           VENT, SITE, SAMPLETYPE, YEAR, DATASET, Type) %>% 
    summarise(SEQ_AVG_REP = mean(value)) %>% 
  ungroup() %>% 
  group_by(SITE, SAMPLETYPE, VENT, Supergroup, Type) %>% 
    summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>% 
  unite(Sample_substrate, VENT, Type, sep = " ") %>% 
  filter(SITE != "GordaRidge") %>% 
  ggplot(aes(x = Sample_substrate, y = SEQ_SUM, fill = Supergroup)) +
    geom_bar(stat = "identity", position = "fill", color = "black", width = 0.6) +
    facet_grid(. ~ SITE + SAMPLETYPE, space = "free_x", scales = "free") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black")) +
  scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values = colors_tax) +
  labs(title = "Microcolonizers and background", x = "", y = "Relative sequence abundance")
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain',
## 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT',
## 'SITE', 'SAMPLETYPE', 'YEAR', 'DATASET'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'Supergroup'.
## You can override using the `.groups` argument.

4.2.1 Taxonomy plot of background fluid

plotly::ggplotly(by_supergroup_back)

4.2.2 Taxonomy plot of microcolonizers

plotly::ggplotly(by_supergroup_substrate)

4.2.3 Plot only metazoa

metazoa_substrate <- asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(Sample_or_Control != "Control") %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
           VENT, SITE, SAMPLETYPE, YEAR, DATASET, Type) %>% 
    summarise(SEQ_AVG_REP = mean(value)) %>% 
  # Opisthokonts only
  filter(Supergroup == "Opisthokonta") %>%
  unite(Phylum_Class, Phylum, Class, sep = "-") %>% 
  ungroup() %>% 
  group_by(SITE, SAMPLETYPE, VENT, Type, Supergroup, Phylum_Class) %>% 
    summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>% 
  unite(Sample_substrate, VENT, Type, sep = " ") %>% 
  ggplot(aes(x = Sample_substrate, y = SEQ_SUM, fill = Phylum_Class)) +
    geom_bar(stat = "identity", position = "fill", color = "black", width = 0.9) +
    facet_grid(. ~ SITE + SAMPLETYPE, scale = "free", space = "free") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black")) +
  scale_y_continuous(expand = c(0,0)) +
  # scale_fill_manual(values = c("#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c", 
  #   "#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45", 
  #   "#00441b", "#7fcdbb", "#084081", "#c6dbef", "#2b8cbe", "#016c59", "#bcbddc", 
  #   "#807dba", "#54278f", "#bdbdbd", "black")) +
  labs(title = "Average across replicates", x = "", y = "Relative sequence abundance")
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain',
## 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT',
## 'SITE', 'SAMPLETYPE', 'YEAR', 'DATASET'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'Type',
## 'Supergroup'. You can override using the `.groups` argument.
plotly::ggplotly(metazoa_substrate)

5 What ASVs are shared across microcolonizers and substrate types?

library(ggupset)
head(asv_GR)
## # A tibble: 6 × 34
##   SAMPLE FeatureID value Taxon Domain Supergroup Phylum Class Order Family Genus
##   <chr>  <chr>     <dbl> <chr> <chr>  <chr>      <chr>  <chr> <chr> <chr>  <chr>
## 1 Gorda… 00096455…    91 Euka… Eukar… Rhizaria   Radio… Acan… <NA>  <NA>   <NA> 
## 2 Gorda… 00165708…     1 Euka… Eukar… Stramenop… Ochro… Pela… Pela… Pelag… Pela…
## 3 Gorda… 0030ad8c…    36 Euka… Eukar… Stramenop… Opalo… MAST… MAST… MAST-… MAST…
## 4 Gorda… 0038478b…    23 Euka… Eukar… Opisthoko… Metaz… Cnid… Cnid… Hydro… <NA> 
## 5 Gorda… 0038478b…   101 Euka… Eukar… Opisthoko… Metaz… Cnid… Cnid… Hydro… <NA> 
## 6 Gorda… 0038478b…    15 Euka… Eukar… Opisthoko… Metaz… Cnid… Cnid… Hydro… <NA> 
## # … with 23 more variables: Species <chr>, Consensus <dbl>, ref_num <int>,
## #   VENT <chr>, COORDINATES <chr>, SITE <chr>, Sample_or_Control <chr>,
## #   SAMPLEID <chr>, SAMPLETYPE <chr>, DEPTH <chr>, YEAR <dbl>, temp <dbl>,
## #   pH <dbl>, percseawater <dbl>, mg <dbl>, h2 <chr>, h2s <dbl>, ch4 <dbl>,
## #   ProkConc <chr>, Sample_actual <chr>, Type <chr>, DATASET <chr>,
## #   DECONTAM <chr>
unique(asv_GR$SAMPLETYPE)
## [1] "Background"     "Plume"          "Microcolonizer" "Vent"
# unique(asv_GR$Type)

What taxa are shared across sample types, background, plume, vent vs. microcolonizers?

asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(Sample_or_Control != "Control") %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Supergroup, 
           SAMPLETYPE) %>% 
    summarise(SUM = sum(value)) %>% 
  ungroup() %>%
  # unite(Sample_substrate, VENT, Type, sep = " ") %>% 
  distinct(FeatureID, Supergroup, SUM, SAMPLETYPE, .keep_all = TRUE) %>% 
  group_by(FeatureID, Supergroup) %>% 
  summarise(sample_type = list(SAMPLETYPE)) %>% 
  ggplot(aes(x = sample_type)) +
    geom_bar(color = "black", width = 0.8, aes(fill = Supergroup)) +
    scale_x_upset(n_intersections = 30)
## `summarise()` has grouped output by 'FeatureID', 'Supergroup'. You can override
## using the `.groups` argument.
## `summarise()` has grouped output by 'FeatureID'. You can override using the
## `.groups` argument.

> Each sample type has a high number of unique ASVs present, ranging from 200 to almost 1,000 ASVs. The highest numbere of ASVs unique to microcolonizers included metazoa. Almost 200 ASVs were present in both diffuse vent fluid and the microcolonizer substrates. This spanned most lineages. Approximately 100 ASVs weere found in all environments.

What for taxa found in microcolonizers and the diffuse fluid, was there a difference in community distribution by substrate type?

asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(Sample_or_Control != "Control") %>% 
  filter(SAMPLETYPE == "Microcolonizer" | SAMPLETYPE == "Vent") %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Supergroup, 
           SAMPLETYPE, VENT) %>% 
    summarise(SUM = sum(value)) %>% 
  ungroup() %>%
  mutate(SUBSTRATE = case_when(
    SAMPLETYPE == "Microcolonizer" ~ VENT,
    SAMPLETYPE != "Microcolonizer" ~ "Diffuse fluid"
  )) %>% 
  # unite(Sample_substrate, VENT, Type, sep = " ") %>%
  distinct(FeatureID, Supergroup, SUM, SUBSTRATE, .keep_all = TRUE) %>% 
  group_by(FeatureID, Supergroup) %>% 
  summarise(sample_type = list(SUBSTRATE)) %>% 
  ggplot(aes(x = sample_type)) +
    geom_bar(color = "black", width = 0.8, aes(fill = Supergroup)) +
    scale_x_upset(n_intersections = 30)
## `summarise()` has grouped output by 'FeatureID', 'Supergroup', 'SAMPLETYPE'.
## You can override using the `.groups` argument.
## `summarise()` has grouped output by 'FeatureID'. You can override using the
## `.groups` argument.

> Shell was most distinct. This also lines up with the fact that the shell was difficult to work with in the lab. The quartz and diffuse fluid had the most similar compositions, followede by the riftia. Dinoflagellates comprised a high proportion of the ASVs unique to diffuse vent fluid, while dinos were comparatively less frequent on each substrate.

What taxa are shared across substrate types?

asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(Sample_or_Control != "Control") %>% 
  filter(SAMPLETYPE == "Microcolonizer") %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Supergroup, 
           VENT) %>% 
    summarise(SUM = sum(value)) %>% 
  ungroup() %>%
  # unite(Sample_substrate, VENT, Type, sep = " ") %>% 
  # distinct(FeatureID, Supergroup, SUM, VENT, .keep_all = TRUE) %>% 
  group_by(FeatureID, Supergroup) %>% 
  summarise(substrate = list(VENT)) %>% 
  ggplot(aes(x = substrate)) +
    geom_bar(color = "black", width = 0.8, aes(fill = Supergroup)) +
    scale_x_upset(n_intersections = 30)
## `summarise()` has grouped output by 'FeatureID', 'Supergroup'. You can override
## using the `.groups` argument.
## `summarise()` has grouped output by 'FeatureID'. You can override using the
## `.groups` argument.

Taking a closer look at ASVs shared versus unique on different substrates, the shell was the most dissimilar - which was attributed to archaeplastida. Quartz and Riftia had over 250 ASVs unique to each substrate type only.

What ASVs were shared across the different microcolonizers by time deployed?

# unique(asv_GR$VENT)
asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(!(Type == "")) %>% 
  filter(Sample_or_Control != "Control") %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Supergroup, Type) %>% 
    summarise(SUM = sum(value)) %>% 
  ungroup() %>%
  # unite(Sample_substrate, VENT, Type, sep = " ") %>% 
  distinct(FeatureID, Supergroup, SUM, Type, .keep_all = TRUE) %>% 
  group_by(FeatureID, Supergroup) %>% 
  summarise(Sample_substrate = list(Type)) %>% 
  ggplot(aes(x = Sample_substrate)) +
    geom_bar(color = "black", width = 0.8, aes(fill = Supergroup)) +
    scale_x_upset(n_intersections = 30)
## `summarise()` has grouped output by 'FeatureID', 'Supergroup'. You can override
## using the `.groups` argument.
## `summarise()` has grouped output by 'FeatureID'. You can override using the
## `.groups` argument.
## Warning: Removed 24 rows containing non-finite values (stat_count).

No pattern.

6 Comparison of community composition

Is there a temporal trend in community composition?

# head(asv_GR)
asv_GR %>% 
  filter(SAMPLETYPE == "Microcolonizer") %>% 
  filter(!is.na(Supergroup)) %>% 
  filter(!grepl("Bacteria", Supergroup)) %>% 
  mutate(Days = case_when(
    Type == "MC1" ~ "7 days",
    Type == "MC2" ~ "6 days",
    Type == "MC3" ~ "6 days",
    Type == "MC4" ~ "7 days",
    Type == "MC5" ~ "8 days",
    Type == "MC6" ~ "8 days"
  )) %>% 
  select(Days, Type, VENT, FeatureID, value, Taxon, Supergroup) %>% 
  unite(MC_SUBSTRATE, Type, VENT, sep = " ", remove = FALSE) %>% 
  ggplot(aes(x = FeatureID, y = VENT, fill = value)) +
  geom_tile() +
  facet_grid(Days ~ Supergroup, space = "free", scale = "free") +
  theme_bw() +
  theme(axis.text.x = element_blank())

> Some temporal trend, where each substrate increases in species richness as the days deployed increases. This varied with respect to taxonomic group.

6.1 Cluster analysis

pre_clr <- asv_GR %>% 
  filter(SAMPLETYPE == "Microcolonizer") %>% 
  filter(!grepl("Bacteria", Supergroup)) %>% 
  mutate(Days = case_when(
    Type == "MC1" ~ "7 days",
    Type == "MC2" ~ "6 days",
    Type == "MC3" ~ "6 days",
    Type == "MC4" ~ "7 days",
    Type == "MC5" ~ "8 days",
    Type == "MC6" ~ "8 days"
  )) %>% 
  select(Days, Type, VENT, FeatureID, value) %>%
  unite(SAMPLE, Type, VENT, Days, sep = "_") %>% 
  pivot_wider(names_from = SAMPLE, values_from = value, values_fill = 0) %>% 
  column_to_rownames(var = "FeatureID")
  
# look at eigenvalues
pca_lr <- prcomp(data.frame(compositions::clr(t(pre_clr))))
variance_lr <- (pca_lr$sdev^2)/sum(pca_lr$sdev^2)
## View bar plot
barplot(variance_lr, main = "Log-Ratio PCA Screeplot", xlab = "PC Axis", ylab = "% Variance", 
    cex.names = 1.5, cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)

## Extract PCR points
data.frame(pca_lr$x, SAMPLE = rownames(pca_lr$x)) %>% 
      separate(SAMPLE, into = c("MC", "Substrate", "Days"), sep = "_") %>% 
  ## Generate PCA plot
  ggplot(aes(x = PC1, y = PC2, shape = Substrate, fill = Days)) +
    geom_hline(yintercept = 0) + geom_vline(xintercept = 0, color = "#525252") +
    geom_point(size=4, stroke = 1, aes(fill = Days)) +
    ylab(paste0('PC2 ',round(variance_lr[2]*100,2),'%')) +
    xlab(paste0('PC1 ',round(variance_lr[1]*100,2),'%')) +
    scale_shape_manual(values = c(21, 23, 24)) +
    theme_bw() +
    theme(axis.text = element_text(color="black", size=12),
          legend.title = element_blank(),
          axis.title = element_text(color="black", size=14),
          legend.text = element_text(color = "black", size = 14),
          plot.margin = margin(2, 1, 2, 1, "cm")) +
    guides(fill = guide_legend(override.aes = list(shape = 21) ),
              shape = guide_legend(override.aes = list(fill = "black")))

After 6 days, there were minor differences in the communities, but than at 7 and 8 days, there were more. This may not be a great indicator, as the microcolonizers collected on day 6 were both situated at higher temperatures. This shows that compared to the microcolonizers placed at sites of cooler diffuse fluid flow, substrates subject to warmer diffuse fluid had a more similar composition to one another.

7 Import temperature data and other information for each microcolonizer

ibuttons deployed with each microcolonizer.

sample_list <- read.delim("input-data/MC-samplelist.txt")
sample_metadata <- read.delim("input-data/MC-collect-info.txt")
amy <- c("Quartz1", "Basalt", "Olivine", "Pyrite")
sample_list_complete <- sample_list %>% 
  left_join(sample_metadata) %>% 
  separate(Sample_type, c("Sample_material", "Sample_type"), sep = "-") %>% 
  mutate(sample_purpose = case_when(
    (Sample_material %in% amy & Sample_type == "DNA") ~ "16S",
    (!(Sample_material %in% amy) & Sample_type == "DNA") ~ "18S",
    (Sample_material %in% amy & Sample_type == "SEM") ~ "Prok SEM",
    (!(Sample_material %in% amy) & Sample_type == "SEM") ~ "Euk SEM"
  ))
## Joining, by = "MC_ID"
# write_delim(sample_list_complete, path = "sample-list-MC.txt", delim = "\t")

7.1 Process ibutton data

list_mc <- read.delim("input-data/microcolonizer_list.txt")
head(list_mc)
##   ibuttons.num Deploy.date Deploy.time..UTC. Recover.date Nautilus.Sample.ID
## 1            1      29-May              8:30        5-Jun          NA108-076
## 2            2      29-May              8:30        4-Jun          NA108-045
## 3            3      29-May              8:30        4-Jun          NA108-046
## 4            4      29-May              8:30        5-Jun          NA108-077
## 5            5      29-May              8:30        6-Jun          NA108-097
## 6            6      29-May              8:30        6-Jun          NA108-098
##    Dive Vent.site.name Recover.time..UTC..start Recover.time..UTC..end     Lat
## 1 H1754     Mt Edwards                     1819                   1822 42.7547
## 2 H1753     Mt Edwards                     2004                   2011 42.7547
## 3 H1753     Mt Edwards                     2011                   2015 42.7547
## 4 H1754     Mt Edwards                     1823                   1824 42.7548
## 5 H1755     Mt Edwards                     2228                   2231 42.7547
## 6 H1755     Mt Edwards                     2231                   2231 42.7547
##       Long Depth..m. Temp..C..Herc Salinigyt.PSU Oxygen..μmoles.L.
## 1 -126.709  2706.768       1.79985       34.6189          64.63451
## 2 -126.709  2707.849       1.95566       34.6527          64.93543
## 3 -126.709  2707.938       4.84292       34.4767          66.27680
## 4 -126.709  2706.780       5.55486       34.6322          68.08677
## 5 -126.709  2707.796       3.29768       34.5240          65.41028
## 6 -126.709  2707.852       2.84660       34.5571          67.84714
##   Corrected.O2data..x0.813.
## 1                   52.5479
## 2                   52.7925
## 3                   53.8830
## 4                   55.3545
## 5                   53.1786
## 6                   55.1597

Deployment of all Microcolonizers was at 2019-05-29, 20:59:57.659 (UTC) (also 8:59 PM UTC), which is 4:59PM EST or 16:59 PM EST

Function to import all ibutton data raw and process.

# Recovered microcolonizers by ibutton IDs:
## 1, 2, 3, 4, 5, 6
recovered <- c("2019-06-05 14:19:00","2019-06-04 16:04:00","2019-06-04 16:11:00","2019-06-05 14:23:00","2019-06-06 18:28:00","2019-06-06 18:31:00")
# Sys.timezone()
# tmp_2 <- logger2 %>% add_column(MC = 2)
# 
# logger1 <- read.csv("input-data/Logger1Data.csv", skip = 19)
# logger2 <- read.csv("input-data/Logger2Data.csv", skip = 19)
# x <- 1
mc_ids <- c(1, 2, 3, 4, 5, 6)

for(num in mc_ids){
  log_file <- read.csv(paste("input-data/Logger", num, "Data.csv", sep = ""), skip = 19)  
  cat("Reading in log file number", num, "\n")
  log_out <- log_file %>% 
    add_column(MC = num) %>% 
    mutate(Parsed_time_EST = parse_date_time(Date.Time, "%m/%d/%y %H:%M:%S p", tz = "America/New_York")) %>%
    # Filter out irrelevant date before deployment
    filter(Parsed_time_EST > "2019-05-29 18:59:00") %>% 
    filter(
      Parsed_time_EST < recovered[[num]]
    )
  if(!exists("log_files_all")){
    log_files_all <- log_out
  } else{
    log_files_all <- bind_rows(log_files_all, log_out)
  }
}
## Reading in log file number 1 
## Reading in log file number 2 
## Reading in log file number 3 
## Reading in log file number 4 
## Reading in log file number 5 
## Reading in log file number 6
# rm(log_out); rm(log_files_all)
# head(log_out)
# View(log_files_all)


# Factor by colors and pairs of MCs
log_files_all$MC_ORDER <- factor(log_files_all$MC, levels = mc_ids)
mc_col <- c("#d7301f", "#4a1486", "#9e9ac8", "#fc8d59", "#2171b5", "#6baed6")
names(mc_col) <- mc_ids

Graph microcolonizer temperatures.

ggplot(log_files_all, aes(x = Parsed_time_EST, y = Value, group = as.factor(MC_ORDER), color = as.factor(MC_ORDER))) +
  geom_path() +
  scale_color_manual(values = mc_col) +
  theme_classic(base_size = 14) +
  labs(x = "", y = "Temperature •C") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        legend.title = element_blank())

7.2 Temperature data

ggplot(log_files_all, aes(x = Parsed_time_EST, y = Value, color = as.factor(MC_ORDER))) +
  geom_step() +
  scale_color_manual(values = mc_col) +
  theme_classic(base_size = 14) +
  labs(x = "", y = "Temperature •C") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        legend.title = element_blank())

# ?geom_smooth

8 Session end

sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS/LAPACK: /Users/sarahhu/anaconda3/envs/r_4.1/lib/libopenblasp-r0.3.15.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggupset_0.3.0   lubridate_1.8.0 plotly_4.10.0   decontam_1.12.0
##  [5] phyloseq_1.36.0 cowplot_1.1.1   forcats_0.5.1   stringr_1.4.0  
##  [9] dplyr_1.0.9     purrr_0.3.4     readr_2.1.2     tidyr_1.2.0    
## [13] tibble_3.1.7    ggplot2_3.3.6   tidyverse_1.3.1
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_2.0-3       ellipsis_0.3.2         XVector_0.32.0        
##   [4] fs_1.5.2               rstudioapi_0.13        farver_2.1.0          
##   [7] fansi_1.0.3            xml2_1.3.3             codetools_0.2-18      
##  [10] splines_4.1.0          robustbase_0.93-9      cachem_1.0.6          
##  [13] knitr_1.39             ade4_1.7-18            jsonlite_1.8.0        
##  [16] broom_1.0.0            cluster_2.1.3          dbplyr_2.2.1          
##  [19] compiler_4.1.0         httr_1.4.3             backports_1.4.1       
##  [22] assertthat_0.2.1       Matrix_1.4-1           fastmap_1.1.0         
##  [25] lazyeval_0.2.2         cli_3.3.0              htmltools_0.5.2       
##  [28] tools_4.1.0            igraph_1.3.1           gtable_0.3.0          
##  [31] glue_1.6.2             GenomeInfoDbData_1.2.6 reshape2_1.4.4        
##  [34] Rcpp_1.0.8             Biobase_2.52.0         cellranger_1.1.0      
##  [37] jquerylib_0.1.4        vctrs_0.4.1            Biostrings_2.60.2     
##  [40] rhdf5filters_1.4.0     multtest_2.48.0        ape_5.6-2             
##  [43] nlme_3.1-158           iterators_1.0.13       crosstalk_1.2.0       
##  [46] tensorA_0.36.2         xfun_0.29              rvest_1.0.2           
##  [49] lifecycle_1.0.1        DEoptimR_1.0-10        zlibbioc_1.38.0       
##  [52] MASS_7.3-57            scales_1.2.0           hms_1.1.1             
##  [55] parallel_4.1.0         biomformat_1.20.0      rhdf5_2.36.0          
##  [58] RColorBrewer_1.1-3     yaml_2.3.5             sass_0.4.0            
##  [61] stringi_1.7.5          highr_0.9              S4Vectors_0.30.2      
##  [64] foreach_1.5.1          permute_0.9-5          BiocGenerics_0.38.0   
##  [67] GenomeInfoDb_1.28.4    compositions_2.0-4     rlang_1.0.4           
##  [70] pkgconfig_2.0.3        bitops_1.0-7           evaluate_0.15         
##  [73] lattice_0.20-45        Rhdf5lib_1.14.2        htmlwidgets_1.5.4     
##  [76] labeling_0.4.2         tidyselect_1.1.2       plyr_1.8.7            
##  [79] magrittr_2.0.3         R6_2.5.1               IRanges_2.26.0        
##  [82] generics_0.1.3         DBI_1.1.2              pillar_1.7.0          
##  [85] haven_2.4.3            withr_2.5.0            mgcv_1.8-36           
##  [88] survival_3.3-1         RCurl_1.98-1.3         bayesm_3.1-4          
##  [91] modelr_0.1.8           crayon_1.5.1           utf8_1.2.2            
##  [94] tzdb_0.2.0             rmarkdown_2.13         grid_4.1.0            
##  [97] readxl_1.3.1           data.table_1.14.0      vegan_2.5-7           
## [100] reprex_2.0.1           digest_0.6.29          stats4_4.1.0          
## [103] munsell_0.5.0          viridisLite_0.4.0      bslib_0.4.0